home *** CD-ROM | disk | FTP | other *** search
- /*
- | | Regular Expression Evaluator:
- | |
- | | Greg Anderson
- | | 29 Kerr Hall
- | | Social Sciences Computing
- | | University of California, Santa Cruz
- | | sirkm@ssyx
- | |
- | | For use with HyperCard XCMDs, XFCNs, and possibly other things.
- | |
- | | Make this file part of your XCMD project.
- | | #include "regexp.h" in any of your files that may use routines
- | | from this package.
- | |
- | | The regular expressions this package matches are as follows:
- | |
- | | c Any ordinary character 'c' not listed below matches that
- | | character.
- | |
- | | \c A backslash (\) followed by a special character (one of
- | | '.', '*', '+', '[' and '\') matches the special character
- | | (i.e., the special meaning is removed).
- | |
- | | . A period (.) matches any single character except RETURN.
- | |
- | | [string] A non-empty string of characters enclosed in square
- | | brackets matches any single character found in the set.
- | | If the first character of such a string is ^, then
- | | any single character NOT in the set is matched. '^'
- | | loses its special meaning if it does not come first
- | | in the string.
- | | The character '-' indicates a range of characters;
- | | for example, [a-z] will match any lowercase letter.
- | | '-' loses its special meaning if it comes first (or
- | | after a leading '^') or last in the string.
- | |
- | | c* Any one-character regular expression followed by a *
- | | matches zero or more occurrences of the single character.
- | | If there is any choice, the longest leftmost string
- | | that matches is returned.
- | |
- | | c+ Like '*', but matches one or more occurrences of the
- | | single character regular expression.
- | |
- | | ^ A caret (^) at the beginning of an entire regular
- | | expression constrains that regular expression to only
- | | match strings found at the beginning of a line.
- | |
- | | $ A currency symbol ($) at the end of an entire regular
- | | expression constrains that regular expression to only
- | | match strings found at the end of a line.
- | |
- | | The following regular expressions are NOT supported:
- | |
- | | \< Beginning of word
- | | \> End of word
- | | \( ... \) "..." is treated as a regular expression
- | | \{n,m\} Repeated matches of previous regular expression.
- | |
- */
- #include <MacTypes.h>
- #include <FileMgr.h>
- #include "regexp.h"
-
#define TRUE    1
#define FALSE   0

/*
|   ASCII-only upper-casing of a single character.
|
|   Every use of the argument, and the condition as a whole, is
|   parenthesized so that an expression argument (for example a
|   conditional expression) keeps its meaning after expansion.  The
|   argument is still evaluated more than once, so it must not have
|   side effects.
*/
#define toupper(c)  ((((c) >= 'a') && ((c) <= 'z')) ? ((c) - ('a' - 'A')) : (c))

/*
|   Option bits tested and updated by the routines in this file.
*/
int regexp_flags;
-
- /*-----------------------------------------------------------------
- | end_of_line:
- |
- | Checks to see if the given character pointer points at the end
- | of a line.
- |
- | Lines end in either a return character (\r) or a null.
- |
- | If MULTILINE is true, then logical lines may be continued on
- | multiple physical lines if succeeding physical lines are indented.
- |
- | If FOLDEDLINE is true, then logical lines may be continued on
- | multiple physical lines by preceeding each return character with
- | a backslash.
- |
- | If NOBREAKS is true, then there are no line breaks; the entire text
- | field is treated as one long line. ^ matches only at the beginning
- | of the text field, and $ matches only at the end.
- |
- | INPUTS: line_ptr: A pointer into the line
- |
- | OUTPUTS: None save the return value.
- |
- | RETURNS: TRUE End of line reached
- | FALSE Not at the end of the line
- -----------------------------------------------------------------*/
- int end_of_line(line_ptr)
- char *line_ptr;
- {
- if( !(*line_ptr) ) return(TRUE);
- if( regexp_flags & NOBREAKS ) return(FALSE);
- if( *line_ptr != '\r' ) return(FALSE);
- if( !(*(line_ptr+1)) ) return(TRUE);
-
- if( (regexp_flags & MULTILINE) && (*(line_ptr+1) <= ' ') )
- return(FALSE);
-
- if( (regexp_flags & FOLDEDLINE) && (*(line_ptr-1) == '\\') )
- return(FALSE);
-
- return(TRUE);
- }
-
- /*-----------------------------------------------------------------
- | find_regexp:
- |
- | Searches for occurances of 'regexp' inside of 'line'.
- |
- | 'regexp' must have had some prior processing--leading ^ and
- | trailing '$' should be stripped before calling. Note that
- | 'greplen' will do this preprocessing.
- |
- | INPUTS: regexp: A pointer to the regular expression
- | line: A pointer to the line to search
- | start: If zero, then 'regexp' must match 'line'
- | starting with the first character of 'line'.
- | end: If zero, then 'regexp' must also match
- | 'line' all the way to the end.
- |
- | OUTPUTS: start: If specified, start will be changed to
- | point to the first character in 'line'
- | that matched 'regexp'. If 'regexp'
- | could be matched in multiple ways
- | (due to wildcards), the leftmost string
- | is returned.
- | end: If specified, end will be changed to
- | point to the first character in 'line'
- | that was not part of 'regexp'. If
- | 'regexp' could be matched in multiple ways
- | (due to wildcards), the longest string
- | that matches is selected.
- |
- | RETURNS: TRUE 'regexp' was found in 'line'
- | FALSE 'regexp' not found--'start' and 'end' are
- | invalid.
- -----------------------------------------------------------------*/
- int find_regexp(regexp,line,start,end)
- char *regexp,
- *line,
- **start,
- **end;
- {
- if( !start )
- return( strgrep(regexp,line,end) );
-
- while( !end_of_line(line) )
- {
- if( strgrep(regexp,line,end) )
- {
- *start = line;
- return(TRUE);
- }
- ++line;
- }
- /*
- | | Special case -- searching for the end of a line and nothing else.
- */
- if( !(*regexp) && !(*end) )
- {
- *start = line;
- return(TRUE);
- }
- return(FALSE);
- }
-
- /*-----------------------------------------------------------------
- | strgrep:
- |
- | Checks to see if the regular expression 'regexp' matches the
- | search line provided. The match must be EXACT: 'line' is not
- | searched for occurances of 'regexp', it is only checked to see
- | if 'regexp' matches 'line' starting with the first character.
- | ('line' may have unmatched trailing characters, however.)
- |
- | INPUTS: regexp: A pointer to the regular expression
- | line: A pointer to the line to search
- | end: If zero, then 'regexp' must also match
- | 'line' all the way to the end.
- |
- | OUTPUTS: end: If specified, end will be changed to
- | point to the first character in 'line'
- | that was not part of 'regexp'. If
- | 'regexp' could be matched in multiple ways
- | (due to wildcards), the longest string
- | that matches is selected.
- -----------------------------------------------------------------*/
- int strgrep(regexp,line,end)
- char *regexp,
- *line,
- **end;
- {
- char *last = 0;
-
- /*
- | | Search over every character in the comparitor string
- */
- while( *regexp )
- {
- /*
- | | If we have reached the end of the line but there are
- | | still characters in the regular expression, then the
- | | search has probably failed.
- | |
- | | Wildcards in the regular expression can make things
- | | a bit trickier, though.
- */
- if( end_of_line(line) )
- {
- if( strcmp( regexp,"*" ) == 0 ) break;
- if( strcmp( regexp+1,"*" ) == 0 ) break;
- return(FALSE);
- }
-
- if( !chargrep(®exp,&line,&last) )
- {
- /*
- | | The search character does not match: if the next regular
- | | expression is not a '*', then the search has FAILED.
- */
- if( *regexp != '*' )
- return(FALSE);
- else
- {
- /*
- | | Back up the line pointer so that the same
- | | character may be checked against the next
- | | element in the regular expression string
- */
- last = 0;
- --line;
- ++regexp;
- }
- }
- }
- /*
- | | If we are searching to the END of the line, then the input
- | | line must be out of valid characters in order to return
- | | a match.
- */
- if( !end )
- return( end_of_line(line) );
-
- *end = line;
- return(TRUE);
- }
-
- /*-----------------------------------------------------------------
- | chargrep:
- |
- | Compares just one character in the regular expression
- |
- | INPUTS: All inputs are pointers to pointers to strings, as
- | chargrep will advance these pointers after comparing
- | them.
- |
- | regexp: Points into the regular expression
- | line: Points into the line being searched
- | last: Points at the last character checked in
- | the regular expression; usually = (*regexp-1).
- |
- | OUTPUTS: regexp: Advanced to the next char in the reg exp.
- | line: Advanced to the next char in search line
- | last: Set to the initial value of 'regexp'.
- -----------------------------------------------------------------*/
- int chargrep(regexp,line,last)
- char **regexp,
- **line,
- **last;
- {
- char c = **line,
- *look = *regexp;
- int match;
-
- switch( **regexp )
- {
- /*
- | | Set search?
- */
- case '[':
- *last = look;
- ++(*line);
- return( searchset(regexp,c) );
- /*
- | | '.' Wildcard matches any single character except newline / return
- | | c can only be a newline/return if one of the flags -m, -f or -b
- | | was specified.
- */
- case '.':
- if( (c != '\r') && (c != '\n') )
- c = '.';
- break;
- /*
- | | Wildcards:
- */
- case '*':
- case '+':
- /*
- | | When a wild card is found, the line is scanned
- | | until the last part of the regular expression
- | | can be found somewhere in the line.
- | |
- | | If the last part of the regular expression is
- | | found multiple times, the longest applicable
- | | match is returned.
- */
- if( !(*last) ) *last = ".";
- match = wild_scan(*regexp+1,line,*last);
- /*
- | | Fixup for '*'-style searches.
- */
- if( !match && **regexp == '*' )
- match = strgrep(*regexp+1,(*line-1),line);
- ++(*line);
- *regexp = "";
- return(match);
- /*
- | | Backslash escape: next character interpreted literally
- | |
- | | Note: Should check for \nnn (octal representation)
- */
- case '\\':
- ++(*regexp);
- break;
- }
-
- /*
- | | At this point, 'c' contains the character from the search
- | | line that must be matched in the regular expression
- | | (EXACTLY). If c does not match the regular expression,
- | | then the search still will not fail if the next character
- | | in the regexp is a '*'
- */
- if( regexp_flags & IGNORE )
- c = toupper(c);
- match = (**regexp == c);
- /*
- | | Set 'last' = the initial value of the regular expression ptr
- | | and advance the regexp and line pointers.
- */
- ++(*regexp);
- ++(*line);
- *last = look;
-
- return(match);
- }
-
- /*-----------------------------------------------------------------
- | searchset:
- |
- | Compares a [list] in the regular expression with just one
- | character in the input line.
- |
- | INPUTS: regexp: A pointer to a pointer into the regular
- | expression
- | check_c: The character to check.
- |
- | Enter with a pointer to a pointer into the regular expression
- | Upon entry, the regexp pointer should point at the '['.
- | Upon exit, it will point to the character AFTER the ']'.
- |
- | RETURNS: TRUE: 'check_c' was in the set
- | FALSE: 'check_c' was not in the set
- -----------------------------------------------------------------*/
- int searchset(regexp,check_c)
- char **regexp,
- check_c;
- {
- char c, /* The char from the set */
- lc = 0; /* The last char from set */
- int found = 0, /* Flag: found check_c? */
- invert = 0; /* Flag: inverted search */
-
- /*
- | | Advance past the '[' and check for a leading '^'
- */
- ++(*regexp);
- c = **regexp;
- if( c == '^' )
- {
- ++invert;
- ++(*regexp);
- c = **regexp;
- }
- ++(*regexp);
- do
- {
- if( regexp_flags & IGNORE )
- c = toupper(c);
- if( (c == '-') && lc )
- {
- /*
- | | Check if the character lies within a range
- */
- if( (lc <= check_c) && (**regexp >= check_c) )
- found = 1;
- lc = 0;
- }
- /*
- | | Check if this character in the regexp list matches the
- | | character being checked.
- */
- else if( c == check_c )
- found = 1;
- lc = c;
- } while( (c = *((*regexp)++) ) != ']' );
-
- return( found ^ invert );
- }
-
- /*-----------------------------------------------------------------
- | wild_scan:
- |
- | Regular expression wildcard handling. Searches for the last part
- | of a regular expression (after a wildcard) in a line.
- |
- | INPUTS: regexp: A pointer to a pointer into the regular
- | expression (points to the character after
- | the wildcard)
- | line: A pointer to a pointer into the line being
- | searched (points at the character to start
- | searching at)
- | last: A pointer to the last character in the regexp
- | before the wildcard.
- |
- | OUTPUTS: regexp: ALWAYS points to the null terminator at the
- | end of regexp.
- | line: points to the last character matched, if there
- | was a match. Otherwise unchanged.
- |
- | RETURNS: TRUE: The pattern matched; line points to the
- | first character not matched.
- | FALSE: The pattern did not match.
- -----------------------------------------------------------------*/
- wild_scan(regexp,line,last)
- char *regexp,
- **line,
- *last;
- {
- char *scan = *line,
- *copy_of_last,
- *dummy;
- int result = FALSE;
-
- while( !end_of_line(scan) )
- {
- /*
- | | If the last part of the regexp is matched at the current
- | | possition of 'scan', then remember that a match has been
- | | found and keep scanning.
- | |
- | | If (and only if) regexp is found, strgrep changes 'line' to
- | | point to the character after the last one matched by regexp.
- */
- if( strgrep(regexp,scan,line) )
- result = TRUE;
- /*
- | | If the character pointed to by scan does not match
- | | the regexp character before the wildcard, then
- | | the scan is terminated.
- */
- copy_of_last = last;
- if( !chargrep(©_of_last,&scan,&dummy) ) break;
- }
- return(result);
- }
-
- /*-----------------------------------------------------------------
- | greplen:
- |
- | Finds the length of a grep search string. In the case of
- | strings containing wild cards, returns the MINIMUM length string
- | that could match the search string.
- |
- | greplen is also responsible for finding the occurance of ^ and $
- | at the beginning and end of the string (respectively). If these
- | flags are specified, greplen notes this fact & then strips them
- | from the passed searchstring.
- |
- | If the grep search string is not valid, greplen returns -1.
- -----------------------------------------------------------------*/
- int greplen(searchstring)
- char **searchstring;
- {
- char c,
- *string;
- int len = 0;
-
- if( regexp_flags & IGNORE )
- MakeUpper(*searchstring);
-
- /*
- | | Does the search string begin with '^'?
- */
- if( **searchstring == '^' )
- {
- ++(*searchstring);
- regexp_flags |= BEGINFLAG;
- }
- string = *searchstring;
- /*
- | | Count the characters in the search string
- */
- while( c = *string++ )
- {
- switch( c )
- {
- /*
- | | Since '*' might match zero characters, the length of
- | | the string is decremented by one, since the previous
- | | character does not have to be matched.
- */
- case '*':
- if( len ) --len;
- break;
- /*
- | | If a '$' is found at the end, then set the 'END' flag.
- | | Otherwise, count the $ as a search character.
- */
- case '$':
- if( (*string) == 0 )
- {
- *(string-1) = 0;
- regexp_flags |= ENDFLAG;
- }
- else
- ++len;
- break;
- /*
- | | Scan through an entire [string], counting it as only
- | | one character. When this loop exits, string points to
- | | the ']', which will be counted in the search length on
- | | the next pass of the while() loop.
- */
- case '[':
- if( *string++ < ' ') return(-1);
- while( *string != ']' )
- if( *string++ < ' ' ) return(-1);
- break;
- /*
- | | Backslash falls through to the default case, but it
- | | first advances past the character after the backslash
- */
- case '\\':
- if( *string++ < ' ') return(-1);
- default:
- ++len;
- }
- }
-
- return(len);
- }
-